Load packages
Load Packages & Set Options
# Attach the packages used throughout the analysis
library(tidyverse)       # data wrangling and plotting
library(tidymodels)      # modeling framework
library(palmerpenguins)  # penguins dataset
library(gt)              # nicely formatted tables
library(bonsai)          # engines for tree-based models
library(conflicted)      # explicit handling of function conflicts

# Resolve name clashes in favor of tidymodels functions, and take
# the `penguins` object from {palmerpenguins}
tidymodels_prefer()
conflict_prefer("penguins", "palmerpenguins")
Clean and Split the Data
Code
# Exploratory plot: bill length vs flipper length, colored by sex,
# sized by body mass, faceted by species; rows with missing sex dropped
penguins |>
  filter(!is.na(sex)) |>
  ggplot(
    aes(
      x = flipper_length_mm,
      y = bill_length_mm,
      color = sex,
      size = body_mass_g
    )
  ) +
  geom_point(alpha = 0.5) +
  facet_wrap(~species)
Prepare & Split Data
# remove rows with missing sex
# exclude year and island
# Prepare the modeling data: drop rows with a missing outcome (sex)
# and exclude columns we will not model on (year, island)
penguins_df <- penguins |>
  drop_na(sex) |>
  select(-year, -island)

# Seed the RNG so the split and resamples are reproducible
set.seed(1234)

# Train/test split, stratified on the outcome
penguin_split <- initial_split(penguins_df, strata = sex)
penguin_train <- training(penguin_split)
penguin_test <- testing(penguin_split)

# 10-fold cross-validation resamples, also stratified on sex
penguin_folds <- vfold_cv(penguin_train, v = 10, strata = sex)
Create Recipe
Code
# Recipe: predict sex from all remaining columns.
# Dummy-encode every nominal predictor rather than naming `species`
# explicitly — identical here (species is the only nominal predictor
# after year/island are dropped), but robust if predictors change.
penguin_rec <-
  recipe(sex ~ ., data = penguin_train) |>
  step_dummy(all_nominal_predictors())
Specify Model
Code
# Ensure the lightgbm engine is installed before specifying the model
rlang::check_installed("lightgbm")

# Boosted-tree classifier via {bonsai}'s lightgbm engine.
# learn_rate, stop_iter, and the engine-specific num_leaves are marked
# for tuning; the number of trees is fixed at 100.
bt_bonsai_spec <-
  boost_tree(
    trees = 100,
    learn_rate = tune(),
    stop_iter = tune()
  ) |>
  set_engine("lightgbm", num_leaves = tune()) |>
  set_mode("classification")
Build Grid for Tuning
Code
# Show the three tunable parameters declared in the model spec
extract_parameter_set_dials(bt_bonsai_spec)
Collection of 3 parameters for tuning
identifier type object
learn_rate learn_rate nparam[+]
stop_iter stop_iter nparam[+]
num_leaves num_leaves nparam[+]
Code
Learning Rate (quantitative)
Transformer: log-10 [1e-100, Inf]
Range (transformed scale): [-10, -1]
Code
# Iterations Before Stopping (quantitative)
Range: [3, 20]
Code
Number of Leaves (quantitative)
Range: [5, 100]
Build Grid for Tuning
Code
# Latin hypercube grid of 50 candidate hyperparameter combinations.
# Seed immediately before the draw: grid_latin_hypercube() is
# stochastic, so this makes the candidate grid itself reproducible
# regardless of what ran earlier in the session.
set.seed(1234)
grid_tune <-
  bt_bonsai_spec |>
  extract_parameter_set_dials() |>
  grid_latin_hypercube(size = 50)
Code
# Peek at the candidate grid
glimpse(grid_tune, width = 50)
Rows: 50
Columns: 3
$ learn_rate <dbl> 4.499336e-02, 3.174162e-05, 9…
$ stop_iter <int> 7, 17, 11, 8, 8, 13, 5, 20, 9…
$ num_leaves <int> 81, 39, 34, 74, 22, 79, 46, 8…
Fit Models & Tune Hyperparameters
Construct our workflow
Code
# Bundle the preprocessing recipe and model spec into one workflow
bt_bonsai_wf <- workflow(penguin_rec, bt_bonsai_spec)
Specify the grid control parameters
Code
# Keep per-resample predictions and the workflow so the tuning
# results can be inspected and finalized later
cntl <- control_grid(
  save_pred = TRUE,
  save_workflow = TRUE
)
Fit Models & Tune Hyperparameters
Code
# Tune over the full grid with 10-fold CV, timing the run.
# Fixed: the original used `=` for assignment; R style requires `<-`.
start_time <- Sys.time()

bt_tune_grid <-
  bt_bonsai_wf |>
  tune_grid(
    resamples = penguin_folds,
    grid = grid_tune,
    control = cntl
  )

end_time <- Sys.time()
time <- end_time - start_time
time
Time difference of 15.73666 secs
Racing with {finetune}
Code
# {finetune} adds racing methods that eliminate poor candidates early
library(finetune)

race_cntl <- control_race(
  save_pred = TRUE,
  save_workflow = TRUE
)
Racing with {finetune}
Code
# Race the same grid with ANOVA-based elimination: clearly inferior
# hyperparameter candidates are dropped after a few resamples.
# Fixed: removed the duplicated library(finetune) call and race_cntl
# definition — both are already set up in the previous chunk.
bt_tune_race <-
  bt_bonsai_wf |>
  tune_race_anova(
    resamples = penguin_folds,
    grid = grid_tune,
    control = race_cntl
  )
Faster tune with more cores
Code
# Larger Latin hypercube grid (250 candidates) for the parallel run.
# Seed first for reproducibility: grid_latin_hypercube() draws randomly.
set.seed(1234)
big_grid <-
  bt_bonsai_spec |>
  extract_parameter_set_dials() |>
  grid_latin_hypercube(size = 250)
Code
# Report how many physical (non-hyperthreaded) cores are available
n_physical_cores <- parallel::detectCores(logical = FALSE)
n_physical_cores
Code
# Register a parallel backend for tune's foreach-based workers.
# Fixed: cap the worker count at the machine's physical core count so
# we never request more cores (6) than are actually available.
# NOTE(review): doMC is fork-based and unavailable on Windows —
# {doParallel} or {future} would be more portable alternatives.
library(doMC)
registerDoMC(cores = min(6, parallel::detectCores(logical = FALSE)))
# Race the larger 250-candidate grid in parallel, timing the run.
# Fixed: the original used `=` for assignment; R style requires `<-`.
start_time <- Sys.time()

bt_tune_fast <-
  bt_bonsai_wf |>
  tune_race_anova(
    resamples = penguin_folds,
    grid = big_grid,
    control = race_cntl
  )

end_time <- Sys.time()
time <- end_time - start_time
time
Time difference of 12.03298 secs
Code
# Identify the best hyperparameter combination by ROC AUC
bt_best_id <- select_best(bt_tune_fast, metric = "roc_auc")
Code
# Finalize the workflow with the best hyperparameters and fit it once
# on the full training set, evaluating on the held-out test set.
# Fixed: removed the duplicated select_best() call — bt_best_id is
# already computed in the previous chunk.
best_bt_race <-
  bt_tune_fast |>
  extract_workflow() |>
  finalize_workflow(bt_best_id) |>
  last_fit(penguin_split)
Code
# collect the metrics for the best model
# Test-set performance (accuracy and ROC AUC) of the finalized model
collect_metrics(best_bt_race)
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.893 Preprocessor1_Model1
2 roc_auc binary 0.973 Preprocessor1_Model1
Code
# plot results of test set fit
# ROC curve from the test-set predictions. `sex` has levels
# female/male, so .pred_female is the first-level event probability
# that roc_curve() expects by default.
best_bt_race |>
  collect_predictions() |>
  roc_curve(truth = sex, .pred_female) |>
  autoplot()
reference:
https://github.com/JamesHWade/r-mlops/blob/main/hyperparameter-tuning.qmd
https://www.youtube.com/watch?v=IzjmuGJgwKQ